# Import Dependencies
%matplotlib inline
# Start Python Imports
import math, time, random, datetime
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Let's be rebels and ignore warnings for now
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
from IPython.display import HTML
from IPython.display import Image
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
display(HTML("<style>.container { width:1600px !important; }</style>"))
# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
The following dataset was dowloaded from Kaggle
df_22 = pd.read_csv('Combined_Flights_2022.csv') ## Flights 2022
df_airlines = pd.read_csv('Airlines.csv') ## Flights from 2017-2022
df_22['Operating_Airline'].value_counts()
WN 731925 DL 512114 AA 495953 OO 440807 UA 352009 YX 197881 B6 156793 MQ 150758 9E 142624 OH 135884 AS 129510 NK 129208 F9 86557 G4 73504 YV 71972 QX 56320 PT 52827 C5 44367 HA 42212 ZW 40300 G7 34793 Name: Operating_Airline, dtype: int64
airline_counts = df_22['Operating_Airline'].value_counts()
airline_counts.index = airline_counts.index.map(df_airlines.set_index('Code')['Description']).rename('Airline')
airline_counts = airline_counts.sort_values(ascending=True).to_frame("Count").reset_index()
df_22['Cancelled'] = df_22['Cancelled'].astype('bool')
airline_counts
| Airline | Count | |
|---|---|---|
| 0 | GoJet Airlines, LLC d/b/a United Express | 34793 |
| 1 | Air Wisconsin Airlines Corp | 40300 |
| 2 | Hawaiian Airlines Inc. | 42212 |
| 3 | Commutair Aka Champlain Enterprises, Inc. | 44367 |
| 4 | Capital Cargo International | 52827 |
| 5 | Horizon Air | 56320 |
| 6 | Mesa Airlines Inc. | 71972 |
| 7 | Allegiant Air | 73504 |
| 8 | Frontier Airlines Inc. | 86557 |
| 9 | Spirit Air Lines | 129208 |
| 10 | Alaska Airlines Inc. | 129510 |
| 11 | Comair Inc. | 135884 |
| 12 | Endeavor Air Inc. | 142624 |
| 13 | Envoy Air | 150758 |
| 14 | JetBlue Airways | 156793 |
| 15 | Republic Airlines | 197881 |
| 16 | United Air Lines Inc. | 352009 |
| 17 | SkyWest Airlines Inc. | 440807 |
| 18 | American Airlines Inc. | 495953 |
| 19 | Delta Air Lines Inc. | 512114 |
| 20 | Southwest Airlines Co. | 731925 |
px.bar(
airline_counts,
y='Airline',
x='Count',
orientation='h',
template='plotly_dark',
title='Total Flights During 2022 Per Airline',
)
## Filters all the cancelled flights
df_22.query('Cancelled').head()
| FlightDate | Airline | Origin | Dest | Cancelled | Diverted | CRSDepTime | DepTime | DepDelayMinutes | DepDelay | ... | WheelsOff | WheelsOn | TaxiIn | CRSArrTime | ArrDelay | ArrDel15 | ArrivalDelayGroups | ArrTimeBlk | DistanceGroup | DivAirportLandings | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 949 | 2022-04-02 | Commutair Aka Champlain Enterprises, Inc. | JAX | IAH | True | False | 1842 | NaN | NaN | NaN | ... | NaN | NaN | NaN | 2031 | NaN | NaN | NaN | 2000-2059 | 4 | 0 |
| 970 | 2022-04-02 | Commutair Aka Champlain Enterprises, Inc. | IAH | JAX | True | False | 1435 | NaN | NaN | NaN | ... | NaN | NaN | NaN | 1757 | NaN | NaN | NaN | 1700-1759 | 4 | 0 |
| 982 | 2022-04-02 | Commutair Aka Champlain Enterprises, Inc. | IAH | ECP | True | False | 1426 | NaN | NaN | NaN | ... | NaN | NaN | NaN | 1611 | NaN | NaN | NaN | 1600-1659 | 3 | 0 |
| 1001 | 2022-04-02 | Commutair Aka Champlain Enterprises, Inc. | ECP | IAH | True | False | 1650 | NaN | NaN | NaN | ... | NaN | NaN | NaN | 1855 | NaN | NaN | NaN | 1800-1859 | 3 | 0 |
| 1451 | 2022-04-01 | GoJet Airlines, LLC d/b/a United Express | ITH | EWR | True | False | 700 | NaN | NaN | NaN | ... | NaN | NaN | NaN | 824 | NaN | NaN | NaN | 0800-0859 | 1 | 0 |
5 rows × 61 columns
pct_cancelled = df_22['Cancelled'].mean()*100
print(f'Of all flights in this DataSet {pct_cancelled:0.2f}% are Cancelled')
Of all flights in this DataSet 3.02% are Cancelled
df_22['Cancelled'].value_counts()
False 3955126 True 123192 Name: Cancelled, dtype: int64
cancelled_counts = df_22.query('Year == 2022 and Cancelled').groupby(['Operating_Airline'])['Cancelled'].agg(['count','sum'])
cancelled_counts.head()
| count | sum | |
|---|---|---|
| Operating_Airline | ||
| 9E | 5819 | 5819 |
| AA | 18736 | 18736 |
| AS | 3738 | 3738 |
| B6 | 7631 | 7631 |
| C5 | 1518 | 1518 |
cancel = df_22.query('Year == 2022').groupby(['Operating_Airline','Cancelled']).size().unstack().reset_index()
cancel.head()
| Cancelled | Operating_Airline | False | True |
|---|---|---|---|
| 0 | 9E | 136805 | 5819 |
| 1 | AA | 477217 | 18736 |
| 2 | AS | 125772 | 3738 |
| 3 | B6 | 149162 | 7631 |
| 4 | C5 | 42849 | 1518 |
cancel.index = cancel.index.rename(None)
cancel.rename(columns={False: 'Not Cancelled', True: 'Cancelled'}).rename_axis(None, axis=1).head()
| Operating_Airline | Not Cancelled | Cancelled | |
|---|---|---|---|
| 0 | 9E | 136805 | 5819 |
| 1 | AA | 477217 | 18736 |
| 2 | AS | 125772 | 3738 |
| 3 | B6 | 149162 | 7631 |
| 4 | C5 | 42849 | 1518 |
cancel2 = (
(
df_22.query('Year == 2022')
.groupby(['Operating_Airline','Cancelled'])
.size()
.unstack()
.reset_index()
)
.rename(columns={False: 'Not Cancelled', True: 'Cancelled'})
.rename_axis(None, axis=1)
)
cancel2.head()
| Operating_Airline | Not Cancelled | Cancelled | |
|---|---|---|---|
| 0 | 9E | 136805 | 5819 |
| 1 | AA | 477217 | 18736 |
| 2 | AS | 125772 | 3738 |
| 3 | B6 | 149162 | 7631 |
| 4 | C5 | 42849 | 1518 |
cancel3 = cancel2.assign(Total=cancel2["Not Cancelled"] + cancel2['Cancelled'])
cancel3.head()
| Operating_Airline | Not Cancelled | Cancelled | Total | |
|---|---|---|---|---|
| 0 | 9E | 136805 | 5819 | 142624 |
| 1 | AA | 477217 | 18736 | 495953 |
| 2 | AS | 125772 | 3738 | 129510 |
| 3 | B6 | 149162 | 7631 | 156793 |
| 4 | C5 | 42849 | 1518 | 44367 |
cancel4 = cancel3.assign(pct_cancelled=(cancel3["Cancelled"] / cancel3['Total'])*100)
cancel4.head()
| Operating_Airline | Not Cancelled | Cancelled | Total | pct_cancelled | |
|---|---|---|---|---|---|
| 0 | 9E | 136805 | 5819 | 142624 | 4.079958 |
| 1 | AA | 477217 | 18736 | 495953 | 3.777777 |
| 2 | AS | 125772 | 3738 | 129510 | 2.886264 |
| 3 | B6 | 149162 | 7631 | 156793 | 4.866926 |
| 4 | C5 | 42849 | 1518 | 44367 | 3.421462 |
airline_map = df_airlines.set_index('Code')['Description']
airline_map
Code
02Q Titan Airways
04Q Tradewind Aviation
05Q Comlux Aviation, AG
06Q Master Top Linhas Aereas Ltd.
07Q Flair Airlines Ltd.
...
ZW Air Wisconsin Airlines Corp
ZX Air Georgian
ZX (1) Airbc Ltd.
ZY Atlantic Gulf Airlines
ZYZ Skyway Aviation Inc.
Name: Description, Length: 1571, dtype: object
cancel['Airline'] = cancel["Operating_Airline"].map(airline_map)
cancel['Airline']
0 Endeavor Air Inc. 1 American Airlines Inc. 2 Alaska Airlines Inc. 3 JetBlue Airways 4 Commutair Aka Champlain Enterprises, Inc. 5 Delta Air Lines Inc. 6 Frontier Airlines Inc. 7 Allegiant Air 8 GoJet Airlines, LLC d/b/a United Express 9 Hawaiian Airlines Inc. 10 Envoy Air 11 Spirit Air Lines 12 Comair Inc. 13 SkyWest Airlines Inc. 14 Capital Cargo International 15 Horizon Air 16 United Air Lines Inc. 17 Southwest Airlines Co. 18 Mesa Airlines Inc. 19 Republic Airlines 20 Air Wisconsin Airlines Corp Name: Airline, dtype: object
cancel5 = cancel4.assign(pct_cancelled=(cancel4["Cancelled"] / cancel4['Total'])*100).assign(Airline=cancel4["Operating_Airline"].map(airline_map))
cancel5.head().head()
| Operating_Airline | Not Cancelled | Cancelled | Total | pct_cancelled | Airline | |
|---|---|---|---|---|---|---|
| 0 | 9E | 136805 | 5819 | 142624 | 4.079958 | Endeavor Air Inc. |
| 1 | AA | 477217 | 18736 | 495953 | 3.777777 | American Airlines Inc. |
| 2 | AS | 125772 | 3738 | 129510 | 2.886264 | Alaska Airlines Inc. |
| 3 | B6 | 149162 | 7631 | 156793 | 4.866926 | JetBlue Airways |
| 4 | C5 | 42849 | 1518 | 44367 | 3.421462 | Commutair Aka Champlain Enterprises, Inc. |
cancel5.set_index('Airline').sort_values('Cancelled')['Cancelled'].plot(kind='barh', figsize=(10,5), title="Cancelation by Airline 2022")
<AxesSubplot:title={'center':'Cancelation by Airline 2022'}, ylabel='Airline'>
fig, ax = plt.subplots(figsize=(10,10))
bars = (
cancel5.set_index('Airline').sort_values('pct_cancelled')['pct_cancelled'].plot(
kind='barh',
figsize=(10,5),
title="% Cancelation by Airline 2022",
width=0.7,
edgecolor='black'
))
ax.bar_label(ax.containers[0], fmt='%0.2f%%', padding=-40, color='white')
plt.show()
df_22['DelayGroup'] = None
df_22.loc[df_22['DepDelayMinutes'] == 0, 'DelayGroup'] = 'OnTime_Early'
df_22.loc[(df_22['DepDelayMinutes'] > 0) & (df_22['DepDelayMinutes'] <= 15), 'DelayGroup'] = 'Small_Delay'
df_22.loc[(df_22['DepDelayMinutes'] > 15) & (df_22['DepDelayMinutes'] <= 45), 'DelayGroup'] = 'Medium_Delay'
df_22.loc[df_22['DepDelayMinutes'] > 45, 'DelayGroup'] = 'Large_Delay'
df_22.loc[df_22['Cancelled'], 'DelayGroup'] = 'Cancelled'
# Generally 0-15 min delays are considered not significant.
# Usually it can be made up for with an earlier arrival,
df_22['DelayGroup'].value_counts()
OnTime_Early 2398098 Small_Delay 722122 Medium_Delay 445619 Large_Delay 389287 Cancelled 123192 Name: DelayGroup, dtype: int64
pal = sns.color_palette()
df_22['DelayGroup'].value_counts(ascending=True).plot(kind='barh', color=pal[5], title='Delay Category', figsize=(10,5))
<AxesSubplot:title={'center':'Delay Category'}>
print((df_22['DelayGroup'].value_counts() / df_22.shape[0] * 100).round(2))
print(sum(df_22['DelayGroup'].value_counts() / df_22.shape[0] * 100))
OnTime_Early 58.80 Small_Delay 17.71 Medium_Delay 10.93 Large_Delay 9.55 Cancelled 3.02 Name: DelayGroup, dtype: float64 99.99999999999999
plt.figure(figsize=(8,6))
sns.countplot(data=df_22, x='DelayGroup')
plt.suptitle('Delay Groups')
Text(0.5, 0.98, 'Delay Groups')
!pip install Pyppeteer
!pyppeteer-install
!pip install Flask Jinja2
Collecting Pyppeteer Using cached pyppeteer-1.0.2-py3-none-any.whl (83 kB) Collecting pyee<9.0.0,>=8.1.0 Using cached pyee-8.2.2-py2.py3-none-any.whl (12 kB) Requirement already satisfied: certifi>=2021 in ./env/lib/python3.10/site-packages (from Pyppeteer) (2022.9.24) Collecting importlib-metadata>=1.4 Downloading importlib_metadata-5.1.0-py3-none-any.whl (21 kB) Collecting appdirs<2.0.0,>=1.4.3 Using cached appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB) Collecting tqdm<5.0.0,>=4.42.1 Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB) Collecting websockets<11.0,>=10.0 Using cached websockets-10.4-cp310-cp310-macosx_10_9_x86_64.whl (97 kB) Requirement already satisfied: urllib3<2.0.0,>=1.25.8 in ./env/lib/python3.10/site-packages (from Pyppeteer) (1.26.11) Collecting zipp>=0.5 Using cached zipp-3.10.0-py3-none-any.whl (6.2 kB) Installing collected packages: pyee, appdirs, zipp, websockets, tqdm, importlib-metadata, Pyppeteer Successfully installed Pyppeteer-1.0.2 appdirs-1.4.4 importlib-metadata-5.1.0 pyee-8.2.2 tqdm-4.64.1 websockets-10.4 zipp-3.10.0 chromium is already installed. Collecting Flask Using cached Flask-2.2.2-py3-none-any.whl (101 kB) Requirement already satisfied: Jinja2 in ./env/lib/python3.10/site-packages (3.0.3) Collecting click>=8.0 Using cached click-8.1.3-py3-none-any.whl (96 kB) Collecting Werkzeug>=2.2.2 Using cached Werkzeug-2.2.2-py3-none-any.whl (232 kB) Collecting itsdangerous>=2.0 Using cached itsdangerous-2.1.2-py3-none-any.whl (15 kB) Requirement already satisfied: MarkupSafe>=2.0 in ./env/lib/python3.10/site-packages (from Jinja2) (2.1.1) Installing collected packages: Werkzeug, itsdangerous, click, Flask Successfully installed Flask-2.2.2 Werkzeug-2.2.2 click-8.1.3 itsdangerous-2.1.2